The purpose of this project is to examine the dependencies between the situation in Afghanistan and tweets concerning Biden and Trump in August 2021, based on NLP techniques.
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
from datetime import datetime
import re
import string
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect
from google_trans_new import google_translator
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
# Show up to 100 rows when a DataFrame is displayed in the notebook.
pd.options.display.max_rows = 100
Versions:
#print('\n'.join(f'{m.__name__} == {m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))
Due to the date on which I started this project (27.08.2021) and the period which I would like to examine (since 01.08.2021), I had to use a library that allows scraping historical tweets. I decided to use snscrape. I noticed that the number of tweets scraped that way (using that library) is significantly smaller than the number of tweets scraped on an ongoing basis (with another library). However, after checking several options, I was under the impression that my choices were severely restricted. So I gave snscrape a chance while being aware of its limitations - bias and variance.
# Scrape up to 1,000,000 historical tweets per hashtag with snscrape, keeping
# only the timestamp and the text of each tweet.
# The `>=` comparison fixes the original off-by-one: `if i > 1_000_000` only
# broke at i == 1,000,001, i.e. after appending 1,000,001 tweets.
biden_tweets = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#Biden since:2021-08-01 until:2021-08-27').get_items()):
    if i >= 1_000_000:
        break
    biden_tweets.append([tweet.date, tweet.content])
biden = pd.DataFrame(biden_tweets, columns = ['Datetime', 'Text'])
#biden.to_csv("Biden.csv", encoding = 'utf-8', index=False)

trump_tweets = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#Trump since:2021-08-01 until:2021-08-27').get_items()):
    if i >= 1_000_000:
        break
    trump_tweets.append([tweet.date, tweet.content])
trump = pd.DataFrame(trump_tweets, columns = ['Datetime', 'Text'])
#trump.to_csv("Trump.csv", encoding = 'utf-8', index=False)
The next step was cleaning the data. Unexpectedly, the Biden dataset included erroneously scraped data (the dates were not actual dates), and some rows caused problems later for both the Biden and Trump datasets, so those rows were deleted from the initial datasets. The rest of the rows were cleaned by removing emojis, http(s) addresses and punctuation marks. The texts were reduced to lower case and joined hashtags (#KabulAiport) were divided (kabul aiport).
# Reload the scraped tweets from disk.  Trump.csv parses cleanly with an
# explicit tz-aware timestamp format; Biden.csv contains malformed rows, so
# its 'Datetime' column is converted only after the bad rows are dropped.
d_parser = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')
biden = pd.read_csv("Biden.csv", sep = ',', engine = 'python')
trump = pd.read_csv("Trump.csv", sep = ',', engine='python', parse_dates = ['Datetime'], date_parser = d_parser)
# Rows flagged by duplicated() are the *second* occurrence of a duplicate;
# index-1 is taken as its first occurrence.
# NOTE(review): this assumes duplicate pairs sit on adjacent rows in the CSV —
# verify against the data before reuse.
duplicateRowsDF = biden[biden.duplicated()]
second_rows = duplicateRowsDF.index
first_rows = second_rows-1
all_rows = second_rows.append(first_rows)
# Hand-picked indices of corrupt rows found during manual inspection are
# dropped together with all duplicate pairs before datetime conversion.
biden = biden.drop([10223, 10224, 33518, 33519, 34884, 45540, 51911,
                    56054, 60112, 66129, 66734, 83760, 97736, 108435]).drop(all_rows).reset_index()
biden["Datetime"] = pd.to_datetime(biden["Datetime"], format = '%Y-%m-%d %H:%M:%S%z')
# Trump.csv had a single problematic row.
trump = trump.drop([31488]).reset_index()
# Strip emoji from these four Unicode blocks; characters outside them are
# left untouched.
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

# Patterns are raw strings (the originals relied on non-raw '\w', '\d', '\/'
# escapes, which emit DeprecationWarnings) and are compiled once at module
# level instead of on every call.
_URL_RE = re.compile(r'(https?:\/\/[\w\d\.\/]*)')
_UNICODE_PUNCT_RE = re.compile(r'[\.,\?!⁉️‼@#⁉️\(\):;&‘’/“”…-]')
_HASHTAG_RE = re.compile(r'([A-Z][a-z]+|([a-z] A)[A-Z])')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_ASCII_PUNCT_RE = re.compile("[%s]" % re.escape(string.punctuation))

def clean_text(text):
    """Normalize a raw tweet for NLP.

    Removes http(s) URLs, punctuation (Unicode and ASCII), emoji, newlines
    and digit-containing words; splits joined CamelCase hashtags into
    separate words; lower-cases the result.  The substitution order matters:
    hashtag splitting must run before lower-casing, and the digit-word
    removal after it.
    """
    text = _URL_RE.sub('', text)            # http(s) addresses
    text = _UNICODE_PUNCT_RE.sub('', text)  # punctuation incl. # and @
    text = _HASHTAG_RE.sub(r'\1 ', text)    # divide joined hashtags
    text = text.lower()
    text = _DIGIT_WORD_RE.sub('', text)     # delete words with digits
    text = text.replace('\n', ' ')
    text = emoji_pattern.sub('', text)
    text = _ASCII_PUNCT_RE.sub('', text)    # remaining ASCII punctuation
    return text

# `clean` was `lambda x: clean_text(x)` — a direct alias is equivalent.
clean = clean_text
# Run the cleaner over every tweet, keeping the timestamp alongside the
# cleaned text (columns: Text, Datetime).
biden_clean = pd.DataFrame({'Text': biden['Text'].apply(clean),
                            'Datetime': biden['Datetime']})
trump_clean = pd.DataFrame({'Text': trump['Text'].apply(clean),
                            'Datetime': trump['Datetime']})
The first two cleaned rows for Biden and Trump datasets:
# Show the first two tweets of each dataset, before and after cleaning,
# for a quick sanity check of clean_text.
for i in range(0,2):
    print("-------------------------------------------------- BIDEN original --------------------------------------------------\n\n",
          biden['Text'][i],
          "\n\n-------------------------------------------------- BIDEN cleaned --------------------------------------------------\n\n",
          biden_clean['Text'][i], "\n")
    print("-------------------------------------------------- TRUMP original --------------------------------------------------\n\n",
          trump['Text'][i],
          "\n\n-------------------------------------------------- TRUMP cleaned --------------------------------------------------\n\n",
          trump_clean['Text'][i], "\n")
-------------------------------------------------- BIDEN original -------------------------------------------------- Saw the Biden speech.. Is US in the midst of a massive PR ‘Spin’ exercise to salvage humongous loss of reputation by changing the narrative??! Hmmm! #KabulAiport #WagTheDog #ChangeTheNarrative #Biden #Afghanishtan #Kabul -------------------------------------------------- BIDEN cleaned -------------------------------------------------- saw the biden speech is us in the midst of a massive pr spin exercise to salvage humongous loss of reputation by changing the narrative hmmm kabul aiport wag the dog change the narrative biden afghanishtan kabul -------------------------------------------------- TRUMP original -------------------------------------------------- Where’s Our Real President: #DonaldJTrump @Potus45 ?? #Trump Won Election Handedly. @DNC @SpeakerPelosi #BiasedMSM & Fixed #Dominion Voting Machines Cheated & Placed @JoeBiden @KamalaHarris in #WH. Hold them all Accountable. 
@nypost @WSJ @OANN @FoxNation https://t.co/LnyUmbrRW1 -------------------------------------------------- TRUMP cleaned -------------------------------------------------- wheres our real president donald jtrump potus trump won election handedly dnc speaker pelosi biased msm amp fixed dominion voting machines cheated amp placed joe biden kamala harris in wh hold them all accountable nypost wsj oann fox nation -------------------------------------------------- BIDEN original -------------------------------------------------- @HouseGOP …and #Biden calls the 💀 dead, heroes⁉️ REALLY⁉️ Or is Biden the big #Zero‼️ -0- -------------------------------------------------- BIDEN cleaned -------------------------------------------------- house gop and biden calls the dead heroes really or is biden the big zero -------------------------------------------------- TRUMP original -------------------------------------------------- @gop this is why there are #ISIS in #Afghanistan because of #Trump corruption and incompetence https://t.co/j2rX7edOTG -------------------------------------------------- TRUMP cleaned -------------------------------------------------- gop this is why there are isis in afghanistan because of trump corruption and incompetence
Then the rows were examined by detecting language for each row.
# Placeholder values: 'Language' is overwritten just below; 'Translation' is
# filled in later for the translated (non-English) subsets.
biden_clean["Language"] = 'NA'
biden_clean["Translation"] = 'NA'
trump_clean["Language"] = 'NA'
trump_clean["Translation"] = 'NA'
# The original used chained indexing (df['Language'][i] = ...) in a Python
# loop, which triggers SettingWithCopyWarning and can silently fail to write.
# Series.apply assigns through a single, safe column assignment.
# NOTE: langdetect's detect() still raises on texts with no detectable
# features (e.g. empty strings), exactly as the per-row loop did.
biden_clean["Language"] = biden_clean['Text'].apply(detect)
trump_clean["Language"] = trump_clean['Text'].apply(detect)
#biden_clean.to_csv("Biden_clean.csv", encoding = 'utf-8', index=False)
#trump_clean.to_csv("Trump_clean.csv", encoding = 'utf-8', index=False)
As a consequence of scraping the data in the middle of the month, the rest of the days (from 27.08.2021 until 01.09.2021) were joined to the original datasets.
# Reload the cleaned datasets and prepend the tweets scraped later for the
# remaining days of August (27.08–01.09.2021).
biden_clean = pd.read_csv("Biden_clean.csv", sep = ',', engine = 'python')
trump_clean = pd.read_csv("Trump_clean.csv", sep = ',', engine = 'python')
biden_clean["Datetime"] = pd.to_datetime(biden_clean["Datetime"], format = '%Y-%m-%d %H:%M:%S%z')
trump_clean["Datetime"] = pd.to_datetime(trump_clean["Datetime"], format = '%Y-%m-%d %H:%M:%S%z')
biden_clean_since_27 = pd.read_csv("Biden_clean_since_27_08_2021.csv", sep = ',', engine = 'python')
trump_clean_since_27 = pd.read_csv("Trump_clean_since_27_08_2021.csv", sep = ',', engine = 'python')
biden_clean_since_27["Datetime"] = pd.to_datetime(biden_clean_since_27["Datetime"], format = '%Y-%m-%d %H:%M:%S%z')
trump_clean_since_27["Datetime"] = pd.to_datetime(trump_clean_since_27["Datetime"], format = '%Y-%m-%d %H:%M:%S%z')
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat with the
# same ignore_index/sort arguments produces an identical result.
biden_clean = pd.concat([biden_clean_since_27, biden_clean], ignore_index = True, sort = False)
trump_clean = pd.concat([trump_clean_since_27, trump_clean], ignore_index = True, sort = False)
For calculating polarity and subjectivity, the TextBlob library was used. However, the library is prepared to be called only for English texts. As a consequence, the non-English tweets had to be translated beforehand.
# PEP 8 (E731) discourages assigning lambdas to names; plain defs are
# equivalent callables and get proper names in tracebacks.
def pol(x):
    """Polarity in [-1.0, 1.0] of an English text, via TextBlob."""
    return TextBlob(x).sentiment.polarity

def sub(x):
    """Subjectivity in [0.0, 1.0] of an English text, via TextBlob."""
    return TextBlob(x).sentiment.subjectivity
Due to the quite massive number of tweets in the datasets, I decided to translate only a few of the most popular languages + Polish as my native language. Among the most common, English took 1st place, Italian 2nd and Spanish or German 3rd. Taking into account mistakes made by the language detector and the similarity between Italian and Spanish, I chose English (tremendous share), Spanish, German, French (European languages, similar but noticeably different) and Polish (native language + curiosity).
The following language's analysis is based on determining polarity and subjectivity, and if needed also on prior translation. Analysis was extended by Word Clouds for English and Polish.
# Per-dataset language distribution; value_counts() already sorts descending,
# so head(6) gives the six most frequent languages.
biden_l = biden_clean['Language'].value_counts()
trump_l = trump_clean['Language'].value_counts()
print('Biden top 6 languages acc. to no. of tweets:\n')
print(biden_l.head(6))
print('\nTrump top 6 languages acc. to no. of tweets:\n')
print(trump_l.head(6))
Biden top 6 languages acc. to no. of tweets: en 101483 it 8306 es 6453 nl 6219 fr 4688 de 3338 Name: Language, dtype: int64 Trump top 6 languages acc. to no. of tweets: en 77528 it 2497 de 2482 es 2354 fr 2269 ro 1777 Name: Language, dtype: int64
# Keep only the tweets whose detected language is English and report their
# share of each dataset.
biden_en = biden_clean[biden_clean['Language'] == "en"]
trump_en = trump_clean[trump_clean['Language'] == "en"]
print("ENGLISH tweets in Biden's tweets: ", len(biden_en)/len(biden_clean),
      "\nENGLISH tweets in Trump's tweets: ", len(trump_en)/len(trump_clean))
ENGLISH tweets in Biden's tweets: 0.6989861281390768 ENGLISH tweets in Trump's tweets: 0.8087796532370799
# biden_en/trump_en are slices of biden_clean/trump_clean; assigning new
# columns on a slice triggers SettingWithCopyWarning and may not stick.
# Taking an explicit copy first makes the assignments unambiguous.
biden_en = biden_en.copy()
trump_en = trump_en.copy()
biden_en["Polarity"] = biden_en['Text'].apply(pol)
biden_en["Subjectivity"] = biden_en['Text'].apply(sub)
trump_en["Polarity"] = trump_en['Text'].apply(pol)
trump_en["Subjectivity"] = trump_en['Text'].apply(sub)
# Exploratory profile of the English subsets.  In a notebook the bare
# ProfileReport expression renders inline; missing-value and duplicate
# sections are disabled to keep report generation fast.
ProfileReport(biden_en, title="Biden English Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
ProfileReport(trump_en, title="Trump English Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
#biden_en.to_csv("Biden_en.csv", encoding = 'utf-8', index = False)
#trump_en.to_csv("Trump_en.csv", encoding = 'utf-8', index = False)
# PEP 8 (E731): use a def instead of assigning a lambda to a name.
def d_parser(x):
    """Parse the tz-aware 'YYYY-MM-DD HH:MM:SS+zzzz' timestamps from the CSVs."""
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')

biden_en = pd.read_csv("Biden_en.csv", sep = ',', engine = 'python', parse_dates = ['Datetime'], date_parser = d_parser)
trump_en = pd.read_csv("Trump_en.csv", sep = ',', engine = 'python', parse_dates = ['Datetime'], date_parser = d_parser)
# A DatetimeIndex is required for the .resample() calls further below.
biden_en.set_index('Datetime', inplace = True)
trump_en.set_index('Datetime', inplace = True)
To create a Word Cloud, there is the necessity to determine the most popular words and their frequencies. To make clouds more meaningful, it is also a good practice to remove common words (called stop words).
# Concatenate all tweets of each dataset into one document, then build a
# bag-of-words count matrix (built-in English stop words removed).
biden_text = biden_en['Text'].str.cat(sep = ' ')
trump_text = trump_en['Text'].str.cat(sep = ' ')
rows = []
for document in (biden_text, trump_text):
    vectorizer = CountVectorizer(stop_words = "english")
    # The original bound the document list to `input`, shadowing the builtin.
    counts = vectorizer.fit_transform([document])
    rows.append(pd.DataFrame(counts.toarray(), columns = vectorizer.get_feature_names()))
# The original copied `rows` element-by-element into a second list before
# concatenating; concatenating directly is equivalent.
tweets = pd.concat(rows, sort = True)
tweets = tweets.set_axis(["Biden", "Trump"])
# Words absent from one corpus become NaN after the outer concat.
tweets = tweets.replace(np.nan, 0)
tweets = tweets.astype(int)
tweets
| aa | aaa | aaaaaaaaaahhh | aaaaaaattt | aaaaand | aaaand | aaaannd | aaand | aaaye | aabbasi | ... | 𝙛𝙞𝙧𝙢𝙡𝙮 | 𝙛𝙧𝙤𝙢 | 𝙝𝙚 | 𝙢𝙚𝙙𝙞𝙖 | 𝙤𝙣 | 𝙥𝙧𝙤𝙗𝙡𝙚𝙢 | 𝙨𝙤𝙘𝙞𝙖𝙡 | 𝙩𝙚𝙖𝙢 | 𝙩𝙪𝙧𝙣𝙨 | 𝙫𝙖𝙘𝙖𝙩𝙞𝙤𝙣 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Biden | 16 | 4 | 0 | 1 | 2 | 4 | 0 | 2 | 1 | 0 | ... | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Trump | 16 | 3 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 101814 columns
# Flip to one row per word, one column per president.
tweets = tweets.T
I was not fully satisfied with the quality of determining stop words, so based on biden_dict and trump_dict, I added my own. Grabbed words sorted descending according to a number of occurrences were printed below.
# Hand-picked additional stop words that CountVectorizer's built-in English
# list did not catch.
tokens_with_sw = (["didnt", "time", "going", "just", "years", "say", "like", "doesnt", "really", "did",
                   "gop", "maga", "right", "dont", "news", "people", "twitter", "today", "don", "day",
                   "says", "im", "think", "know", "does", "let", "speech", "want", "story", "big",
                   "better", "man", "women", "wrong", "bidens", "trumps", "cnn", "media", "country", "radio",
                   "rally", "trying", "make", "house", "days", "hands", "cuomo", "thing", "look", "good", "guy",
                   "world", "way", "said", "family", "stop", "great", "real", "new", "press", "amp"])

# Word -> frequency, most frequent first.  Filtering while building the dict
# (with an O(1) set lookup) replaces the original build-then-delete loop and
# yields the same dict in the same order.
_excluded = set(tokens_with_sw)
top = tweets["Biden"].sort_values(ascending=False)
biden_dict = {word: count for word, count in zip(top.index.tolist(), top.values)
              if word not in _excluded}
#biden_dict
dict(list(biden_dict.items())[:10])
{'biden': 143106,
'afghanistan': 37219,
'trump': 21452,
'taliban': 17993,
'joe': 11517,
'kabul': 11183,
'president': 10198,
'america': 8393,
'usa': 7285,
'potus': 6794}
# Same construction as biden_dict: word -> frequency, most frequent first,
# with the custom stop words filtered out during the build (set lookup
# instead of the original per-word delete loop; result is identical).
_excluded = set(tokens_with_sw)
top = tweets["Trump"].sort_values(ascending=False)
trump_dict = {word: count for word, count in zip(top.index.tolist(), top.values)
              if word not in _excluded}
#trump_dict
dict(list(trump_dict.items())[:10])
{'trump': 98669,
'biden': 27296,
'afghanistan': 11835,
'taliban': 11613,
'president': 6541,
'america': 6357,
'usa': 5420,
'joe': 5279,
'donald': 4326,
'republicans': 4212}
Among the most popular words, in addition to "biden" or "trump", there were also terms related to Afghanistan, like: "taliban", "afghan", "afghanistan", "kabul" or "airport". Some words were not connected with the current political situation at all (e.g. "border", "covid", "vaccine") and some were ambiguous: "help", "disaster", "left", "failure". In both word clouds, the words "biden" and "trump" occurred often.
# Side-by-side word clouds built from the filtered frequency dicts.
wc = WordCloud(background_color = "white", colormap = "Dark2", width = 800, height = 400, max_words = 200)
plt.rcParams['figure.figsize'] = [32, 12]
# The original dict also carried an unused "text" entry; only the titles are
# read here.  enumerate replaces the range(len(...)) index loop.
president = {"president": ["Biden", "Trump"]}
presidents = [biden_dict, trump_dict]
for index, frequencies in enumerate(presidents):
    wc.generate_from_frequencies(frequencies)
    plt.subplot(1, 2, index+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(president['president'][index], fontsize=25)
plt.show()
The following 3 analyses were done similarly - translation - polarity and subjectivity and presidents' profiles. So I am not going to focus on that.
# Spanish subsets with a fresh RangeIndex (reset_index(drop=True) is the
# original reset_index() + drop of the leftover 'index' column in one step).
biden_es = biden_clean[biden_clean['Language'] == "es"].reset_index(drop=True)
trump_es = trump_clean[trump_clean['Language'] == "es"].reset_index(drop=True)
print("SPANISH tweets in Biden's tweets: ", len(biden_es)/len(biden_clean),
      "\nSPANISH tweets in Trump's tweets: ", len(trump_es)/len(trump_clean))
SPANISH tweets in Biden's tweets: 0.044446434229195655 SPANISH tweets in Trump's tweets: 0.024557157462079326
# One translator client per dataset — the original constructed a new
# google_translator() inside every loop iteration, which is wasteful.
translator = google_translator()
tweets_translations = [translator.translate(text, lang_tgt='en') for text in biden_es['Text']]
translations_biden = pd.DataFrame(tweets_translations)
biden_es['Translation'] = translations_biden.values
tweets_translations = [translator.translate(text, lang_tgt='en') for text in trump_es['Text']]
translations_trump = pd.DataFrame(tweets_translations)
trump_es['Translation'] = translations_trump.values
# Sentiment is computed on the English translations (TextBlob only handles
# English).
biden_es.loc[:, "Polarity"] = biden_es['Translation'].apply(pol)
biden_es.loc[:, "Subjectivity"] = biden_es['Translation'].apply(sub)
trump_es.loc[:, "Polarity"] = trump_es['Translation'].apply(pol)
trump_es.loc[:, "Subjectivity"] = trump_es['Translation'].apply(sub)
# Profiling reports for the Spanish subsets (rendered inline in a notebook).
ProfileReport(biden_es, title="Biden Spanish Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
ProfileReport(trump_es, title="Trump Spanish Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
#biden_es.to_csv("Biden_es.csv", encoding = 'utf-8', index = False)
#trump_es.to_csv("Trump_es.csv", encoding = 'utf-8', index = False)
# Reload the translated Spanish subsets with tz-aware timestamps and index
# them by Datetime for later resampling.
def d_parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')

biden_es = pd.read_csv("Biden_es.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
trump_es = pd.read_csv("Trump_es.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
biden_es = biden_es.set_index('Datetime')
trump_es = trump_es.set_index('Datetime')
# German subsets with a fresh RangeIndex (reset_index(drop=True) combines the
# original reset_index() + drop of the leftover 'index' column).
biden_de = biden_clean[biden_clean['Language'] == "de"].reset_index(drop=True)
trump_de = trump_clean[trump_clean['Language'] == "de"].reset_index(drop=True)
print("GERMAN tweets in Biden's tweets: ", len(biden_de)/len(biden_clean),
      "\nGERMAN tweets in Trump's tweets: ", len(trump_de)/len(trump_clean))
GERMAN tweets in Biden's tweets: 0.022991197498381386 GERMAN tweets in Trump's tweets: 0.025892465939201737
# One translator client per dataset (the original created a new
# google_translator() per tweet inside the loop).
translator = google_translator()
tweets_translations = [translator.translate(text, lang_tgt='en') for text in biden_de['Text']]
translations_biden = pd.DataFrame(tweets_translations)
biden_de['Translation'] = translations_biden.values
tweets_translations = [translator.translate(text, lang_tgt='en') for text in trump_de['Text']]
translations_trump = pd.DataFrame(tweets_translations)
trump_de['Translation'] = translations_trump.values
# Sentiment on the English translations.
biden_de.loc[:, "Polarity"] = biden_de['Translation'].apply(pol)
biden_de.loc[:, "Subjectivity"] = biden_de['Translation'].apply(sub)
trump_de.loc[:, "Polarity"] = trump_de['Translation'].apply(pol)
trump_de.loc[:, "Subjectivity"] = trump_de['Translation'].apply(sub)
# Profiling reports for the German subsets (rendered inline in a notebook).
ProfileReport(biden_de, title="Biden German Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
ProfileReport(trump_de, title="Trump German Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
#biden_de.to_csv("Biden_de.csv", encoding = 'utf-8', index = False)
#trump_de.to_csv("Trump_de.csv", encoding = 'utf-8', index = False)
# Reload the translated German subsets and index them by Datetime.
def d_parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')

biden_de = pd.read_csv("Biden_de.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
trump_de = pd.read_csv("Trump_de.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
biden_de = biden_de.set_index('Datetime')
trump_de = trump_de.set_index('Datetime')
# French subsets with a fresh RangeIndex.
biden_fr = biden_clean[biden_clean['Language'] == "fr"].reset_index(drop=True)
trump_fr = trump_clean[trump_clean['Language'] == "fr"].reset_index(drop=True)
print("FRENCH tweets in Biden's tweets: ", len(biden_fr)/len(biden_clean),
      "\nFRENCH tweets in Trump's tweets: ", len(trump_fr)/len(trump_clean))
FRENCH tweets in Biden's tweets: 0.032289614701141986 FRENCH tweets in Trump's tweets: 0.023670429176490225
# One translator client per dataset (the original created a new
# google_translator() per tweet inside the loop).
translator = google_translator()
tweets_translations = [translator.translate(text, lang_tgt='en') for text in biden_fr['Text']]
translations_biden = pd.DataFrame(tweets_translations)
biden_fr['Translation'] = translations_biden.values
tweets_translations = [translator.translate(text, lang_tgt='en') for text in trump_fr['Text']]
translations_trump = pd.DataFrame(tweets_translations)
trump_fr['Translation'] = translations_trump.values
# Sentiment on the English translations.
biden_fr.loc[:, "Polarity"] = biden_fr['Translation'].apply(pol)
biden_fr.loc[:, "Subjectivity"] = biden_fr['Translation'].apply(sub)
trump_fr.loc[:, "Polarity"] = trump_fr['Translation'].apply(pol)
trump_fr.loc[:, "Subjectivity"] = trump_fr['Translation'].apply(sub)
# Profiling reports for the French subsets (rendered inline in a notebook).
ProfileReport(biden_fr, title="Biden French Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
# BUG FIX: the original passed biden_fr here as well, so the "Trump French"
# report actually profiled Biden's data.
ProfileReport(trump_fr, title="Trump French Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
#biden_fr.to_csv("Biden_fr.csv", encoding = 'utf-8', index = False)
#trump_fr.to_csv("Trump_fr.csv", encoding = 'utf-8', index = False)
# Reload the translated French subsets and index them by Datetime.
def d_parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')

biden_fr = pd.read_csv("Biden_fr.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
trump_fr = pd.read_csv("Trump_fr.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
biden_fr = biden_fr.set_index('Datetime')
trump_fr = trump_fr.set_index('Datetime')
For the Polish language, steps similar to the English language have been taken.
# Polish subsets with a fresh RangeIndex.
biden_pl = biden_clean[biden_clean['Language'] == "pl"].reset_index(drop=True)
trump_pl = trump_clean[trump_clean['Language'] == "pl"].reset_index(drop=True)
print("POLISH tweets in Biden's tweets: ", len(biden_pl)/len(biden_clean),
      "\nPOLISH tweets in Trump's tweets: ", len(trump_pl)/len(trump_clean))
POLISH tweets in Biden's tweets: 0.0042497210474839175 POLISH tweets in Trump's tweets: 0.0023576540299192557
# One translator client per dataset (the original created a new
# google_translator() per tweet inside the loop).
translator = google_translator()
tweets_translations = [translator.translate(text, lang_tgt='en') for text in biden_pl['Text']]
translations_biden = pd.DataFrame(tweets_translations)
biden_pl['Translation'] = translations_biden.values
tweets_translations = [translator.translate(text, lang_tgt='en') for text in trump_pl['Text']]
translations_trump = pd.DataFrame(tweets_translations)
trump_pl['Translation'] = translations_trump.values
# Sentiment on the English translations.
biden_pl.loc[:, "Polarity"] = biden_pl['Translation'].apply(pol)
biden_pl.loc[:, "Subjectivity"] = biden_pl['Translation'].apply(sub)
trump_pl.loc[:, "Polarity"] = trump_pl['Translation'].apply(pol)
trump_pl.loc[:, "Subjectivity"] = trump_pl['Translation'].apply(sub)
# Profiling reports for the Polish subsets (rendered inline in a notebook).
ProfileReport(biden_pl, title="Biden Polish Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
# BUG FIX: the original passed biden_pl here as well, so the "Trump Polish"
# report actually profiled Biden's data.
ProfileReport(trump_pl, title="Trump Polish Tweets | Pandas Profiling Report",
              missing_diagrams=None, duplicates=None)
#biden_pl.to_csv("Biden_pl.csv", encoding = 'utf-8', index = False)
#trump_pl.to_csv("Trump_pl.csv", encoding = 'utf-8', index = False)
# Reload the translated Polish subsets and index them by Datetime.
def d_parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')

biden_pl = pd.read_csv("Biden_pl.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
trump_pl = pd.read_csv("Trump_pl.csv", sep = ',', engine = 'python',
                       parse_dates = ['Datetime'], date_parser = d_parser)
biden_pl = biden_pl.set_index('Datetime')
trump_pl = trump_pl.set_index('Datetime')
# Same bag-of-words construction as for English, but without a built-in stop
# word list (CountVectorizer has no Polish one; custom stop words are removed
# further below).
biden_text = biden_pl['Text'].str.cat(sep = ' ')
trump_text = trump_pl['Text'].str.cat(sep = ' ')
rows = []
for document in (biden_text, trump_text):
    vectorizer = CountVectorizer()
    # The original bound the document list to `input`, shadowing the builtin.
    counts = vectorizer.fit_transform([document])
    rows.append(pd.DataFrame(counts.toarray(), columns = vectorizer.get_feature_names()))
# The original copied `rows` element-by-element into a second list before
# concatenating; concatenating directly is equivalent.
tweets = pd.concat(rows, sort = True)
tweets = tweets.set_axis(["Biden", "Trump"])
# Words absent from one corpus become NaN after the outer concat.
tweets = tweets.replace(np.nan, 0)
tweets = tweets.astype(int)
tweets
| aaa | ab | abc | abdula | aborcji | aborcję | abrams | absolutnie | aby | acosta | ... | żydów | żyje | żyjemy | żykistanu | żyliście | żywego | żywo | żyć | żył | żyłbiden | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Biden | 1 | 3 | 0 | 0 | 1 | 1 | 2 | 2 | 9 | 0 | ... | 1 | 2 | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 1 |
| Trump | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
2 rows × 5783 columns
# Flip to one row per word, one column per president.
tweets = tweets.T
# Hand-picked Polish stop words / noise tokens.  NOTE: "jego" appears twice in
# the original list; harmless, since the list is only used for membership tests.
tokens_with_sw = (["nie", "to", "na", "się", "że", "do", "jest", "co", "za", "jak",
                   "ma", "po", "już", "tak", "czy", "od", "tego", "tym", "dla", "ale",
                   "przez", "będzie", "tylko", "teraz", "jego", "ów", "bo", "też", "są", "by",
                   "było", "ze", "chcą", "niż", "nord", "stream", "być", "więcej", "nawet", "będą",
                   "on", "mówi", "bez", "lat", "nas", "nigdy", "sobie", "niech", "świat", "ich", "bia",
                   "nadal", "której", "ńczycy", "wszystkich", "nic", "sprawie", "jeśli", "tam", "gdzie", "gdy",
                   "media", "przed", "jeszcze", "ten", "pod", "mu", "pytanie", "cały", "może", "czyli",
                   "był", "jednak", "bardzo", "wsch", "który", "pierwsz", "ta", "is", "bliski", "czasy",
                   "si", "us", "można", "no", "powiedział", "mnie", "jaki", "pl", "ód", "twierdzi", "the",
                   "twitter", "gdybyby", "były", "now", "ły", "jego", "wszystko", "jako", "gdyby", "we", "wi",
                   "które", "zrobić", "podjeli", "kto", "powinni", "świecie", "aby", "byla", "właśnie", "mam",
                   "tych", "mówił", "podjął", "wiadomosci", "bweglarczyk"])

# Word -> frequency, most frequent first, with the custom stop words filtered
# out during the build (set lookup replaces the original per-word delete loop;
# the resulting dict and its order are identical).
_excluded = set(tokens_with_sw)
top = tweets["Biden"].sort_values(ascending=False)
biden_dict = {word: count for word, count in zip(top.index.tolist(), top.values)
              if word not in _excluded}
#biden_dict
dict(list(biden_dict.items())[:10])
{'biden': 637,
'usa': 197,
'afganistan': 149,
'joe': 102,
'kabul': 72,
'bidena': 69,
'prezydent': 64,
'trump': 60,
'afganistanu': 51,
'taliban': 45}
# Same construction as the Polish biden_dict: word -> frequency, most frequent
# first, with the custom stop words filtered out during the build.
_excluded = set(tokens_with_sw)
top = tweets["Trump"].sort_values(ascending=False)
trump_dict = {word: count for word, count in zip(top.index.tolist(), top.values)
              if word not in _excluded}
#trump_dict
dict(list(trump_dict.items())[:10])
{'trump': 249,
'biden': 73,
'usa': 64,
'afganistan': 47,
'donald': 38,
'taliban': 28,
'joe': 27,
'prezydent': 26,
'afganistanu': 17,
'trumpa': 17}
Similar to English, tweets in Polish covered similar "areas" (including tweets related to Afghanistan). Additionally, there were words related to the situation in Poland, e.g. "duda", "lgbt", "onet".
# Side-by-side Polish word clouds built from the filtered frequency dicts.
wc = WordCloud(background_color = "white", colormap = "Dark2", width = 800, height = 400, max_words = 200)
plt.rcParams['figure.figsize'] = [32, 12]
# The original dict also carried an unused "text" entry; only the titles are
# read here.  enumerate replaces the range(len(...)) index loop.
president = {"president": ["Biden", "Trump"]}
presidents = [biden_dict, trump_dict]
for index, frequencies in enumerate(presidents):
    wc.generate_from_frequencies(frequencies)
    plt.subplot(1, 2, index+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(president['president'][index], fontsize=25)
plt.show()
The final step of my project was data visualization!
Datasets for this part consisted of tweets which languages were analyzed priorly, i.e. English, Spanish, German, French and Polish.
# Stack the five analysed language subsets back into one frame per president
# (the Datetime index becomes a column again via reset_index).
biden = pd.concat([biden_en, biden_es, biden_de, biden_fr, biden_pl], axis = 'rows').reset_index()
trump = pd.concat([trump_en, trump_es, trump_de, trump_fr, trump_pl], axis = 'rows').reset_index()
# Global plot defaults for the visualisation section.
plt.rcParams.update({
    'figure.figsize': [8.0, 6.0],
    'figure.dpi': 80,
    'savefig.dpi': 100,
    'font.size': 12,
    'legend.fontsize': 'large',
    'figure.titlesize': 'medium',
})
Firstly, I checked whether there is a clear difference between the days and the number of Biden's tweets. For all languages, two dominant areas can be observed. The first one (higher bar, approximately 15.08.2021) concerns the situation in Afghanistan - the withdrawal of troops and the Taliban's seizure of power in Afghanistan. The second one (lower bar, approximately 26.08.2021) could concern Biden's speech about the Kabul airport attack, which killed a dozen American troops, Biden's confirmation of troop evacuation by 31.08.2021, or other issues which were addressed at the 25.08.2021 conference.
# Tweet volume over time: one histogram panel per detected language.
plt.style.use("seaborn")
biden['Datetime'].hist(bins=100, by = biden['Language'], xrot = 22.5, xlabelsize=14, ylabelsize=14, figsize=(20,8))
array([[<AxesSubplot:title={'center':'de'}>,
<AxesSubplot:title={'center':'en'}>],
[<AxesSubplot:title={'center':'es'}>,
<AxesSubplot:title={'center':'fr'}>],
[<AxesSubplot:title={'center':'pl'}>, <AxesSubplot:>]],
dtype=object)
Similar (but not for all languages) observations can be spotted in Trump's dataset.
# Same per-language tweet-volume histograms for the Trump dataset.
trump['Datetime'].hist(bins=100, by = trump['Language'], xrot = 22.5, xlabelsize=14, ylabelsize=14, figsize=(20,8))
array([[<AxesSubplot:title={'center':'de'}>,
<AxesSubplot:title={'center':'en'}>],
[<AxesSubplot:title={'center':'es'}>,
<AxesSubplot:title={'center':'fr'}>],
[<AxesSubplot:title={'center':'pl'}>, <AxesSubplot:>]],
dtype=object)
Looking at Polarity and Subjectivity in Biden and Trump's datasets, a preponderance of neutral tweets (0 - Polarity) can be observed for both presidents. The number of tweets for Polarity decreases smoothly (without including point 0.0). When it comes to Subjectivity, the highest number of tweets can be spotted for point 0.0 (lack of Subjectivity). The second-largest point goes to point 0.5, and since which the number of tweets flattens.
# Sentiment distributions per language; legend=True labels the overlaid
# Polarity and Subjectivity histograms within each panel.
biden[['Polarity', 'Subjectivity']].hist(bins=25, by = biden['Language'], xlabelsize=14, ylabelsize=14,
                                         legend = True, figsize=(20,10))
array([[<AxesSubplot:title={'center':'de'}>,
<AxesSubplot:title={'center':'en'}>],
[<AxesSubplot:title={'center':'es'}>,
<AxesSubplot:title={'center':'fr'}>],
[<AxesSubplot:title={'center':'pl'}>, <AxesSubplot:>]],
dtype=object)
# Same per-language sentiment distributions for the Trump dataset.
trump[['Polarity', 'Subjectivity']].hist(bins=25, by = trump['Language'], xlabelsize=14, ylabelsize=14,
                                         legend = True, figsize=(20,10))
array([[<AxesSubplot:title={'center':'de'}>,
<AxesSubplot:title={'center':'en'}>],
[<AxesSubplot:title={'center':'es'}>,
<AxesSubplot:title={'center':'fr'}>],
[<AxesSubplot:title={'center':'pl'}>, <AxesSubplot:>]],
dtype=object)
Finally, the data were grouped according to hours, and the means for polarity and subjectivity were calculated. For different languages, different thresholds were set:
def _mean_by_period(df, rule):
    """Resample Polarity/Subjectivity into `rule`-sized time bins (mean per
    bin) and expose the bin timestamps as a 'Date' column for plotting.

    Requires the frame to be indexed by Datetime (set earlier via set_index).
    """
    out = df[['Polarity', 'Subjectivity']].resample(rule).mean()
    out['Date'] = out.index
    return out

# The ten copy-pasted resample blocks collapse into one helper call each.
# Sparser languages get coarser bins so the per-bin means stay meaningful.
biden_plot_en = _mean_by_period(biden_en, '6H')
trump_plot_en = _mean_by_period(trump_en, '6H')
biden_plot_es = _mean_by_period(biden_es, '12H')
trump_plot_es = _mean_by_period(trump_es, '12H')
biden_plot_de = _mean_by_period(biden_de, '12H')
trump_plot_de = _mean_by_period(trump_de, '12H')
biden_plot_fr = _mean_by_period(biden_fr, '12H')
trump_plot_fr = _mean_by_period(trump_fr, '12H')
biden_plot_pl = _mean_by_period(biden_pl, '24H')
trump_plot_pl = _mean_by_period(trump_pl, '24H')
And the plots - polarity and subjectivity according to date were created.
As can be seen, the English tweets were characterized by low variance in comparison to the other languages. English tweets fluctuated around 0.0 - 0.1 for polarity and around 0.3 - 0.4 for subjectivity. The visible difference could be caused by the tremendously higher share of English tweets in the datasets, changes in meaning caused by translation, and the exclusion of high polarity and subjectivity data from the initial datasets.
# Resampled frames in (Biden, Trump) pairs, ordered en, es, de, fr, pl —
# the plotting loops below rely on this layout (j selects the president,
# j+2/j+4/... steps through the languages).
dfs = [biden_plot_en, trump_plot_en, biden_plot_es, trump_plot_es, biden_plot_de, trump_plot_de,
       biden_plot_fr, trump_plot_fr, biden_plot_pl, trump_plot_pl]
names = ['Biden polarity', 'Trump polarity', 'Biden subjectivity', 'Trump subjectivity']
pol_sub = ['Polarity', 'Subjectivity']
plt.figure(figsize=(24,32))
# Four stacked subplots: (polarity, subjectivity) x (Biden, Trump), each
# overlaying all five languages; k is the running subplot counter.
k = 0
for i in range(0, 2):        # i: 0 = Polarity, 1 = Subjectivity
    for j in range(0, 2):    # j: 0 = Biden, 1 = Trump
        plt.style.use("fivethirtyeight")
        plt.subplot(4, 1, k+1)
        # English gets a thicker line as the dominant (reference) language.
        plt.plot(dfs[j]['Date'], dfs[j][pol_sub[i]], label = 'en', linewidth = 4)
        plt.plot(dfs[j+2]['Date'], dfs[j+2][pol_sub[i]], label = 'es', linewidth = 2)
        plt.plot(dfs[j+4]['Date'], dfs[j+4][pol_sub[i]], label = 'de', linewidth = 2)
        plt.plot(dfs[j+6]['Date'], dfs[j+6][pol_sub[i]], label = 'fr', linewidth = 2)
        plt.plot(dfs[j+8]['Date'], dfs[j+8][pol_sub[i]], label = 'pl', linewidth = 2)
        plt.xticks(size=14)
        plt.yticks(size=14)
        label = pol_sub[i]
        plt.ylabel(label, size = 16)
        plt.legend(loc=2, fontsize = 'large')
        name = names[k]
        plt.title(name, size = 20)
        k += 1
plt.show()
Comparing datasets at one graph for each language revealed a more insightful view of datasets. For nearly all languages, a substantial polarity decrease for Biden after 11.08.2021 (withdrawing US forces from Afghanistan) could be seen. That relation is clearly visible for the English dataset.
# Ten stacked subplots: for each language (pairs in dfs), Polarity and then
# Subjectivity, each comparing Biden vs Trump on one axis.
names = ['English', 'English', 'Spanish', 'Spanish', 'German', 'German', 'French', 'French', 'Polish', 'Polish']
plt.figure(figsize=(24,60))
k = 0
for i in range(0, len(dfs), 2):   # i: even index = Biden frame, i+1 = Trump frame
    for j in range(0, 2):         # j: 0 = Polarity, 1 = Subjectivity
        plt.subplot(10, 1, k+1)
        plt.plot(dfs[i]['Date'], dfs[i][pol_sub[j]], label = 'Biden', linewidth = 2.5)
        plt.plot(dfs[i+1]['Date'], dfs[i+1][pol_sub[j]], label = 'Trump', linewidth = 2.5)
        plt.xticks(size=14)
        plt.yticks(size=14)
        label = pol_sub[j]
        plt.ylabel(label, size = 16)
        plt.legend(loc=2, fontsize = 'x-large')
        name = pol_sub[j] + ' in ' + names[i] + ' tweets'
        plt.title(name, size = 20)
        k += 1
plt.show()